{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parse the GloVe text file: each non-empty line is a token followed by its vector components.\n",
    "dictionary = {}  # word -> row index into `vectors`\n",
    "vectors = []     # one 1-D float array per word, in row order\n",
    "# `with` closes the handle even if parsing fails; the explicit encoding avoids relying on\n",
    "# the platform default (assumes the file is UTF-8, as the published GloVe archives are).\n",
    "with open('glove.6B.200d.txt', 'r', encoding = 'utf-8') as f:\n",
    "    for line in f:\n",
    "        splitLine = line.split()\n",
    "        if not splitLine:\n",
    "            # Skip blank lines (e.g. a trailing newline) instead of raising IndexError.\n",
    "            continue\n",
    "        word = splitLine[0]\n",
    "        # Index by the number of rows stored so far, so ids stay aligned with\n",
    "        # `vectors` even when a line was skipped.\n",
    "        dictionary[word] = len(vectors)\n",
    "        embedding = np.array([float(val) for val in splitLine[1:]])\n",
    "        vectors.append(embedding)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collapse the list of per-word rows into a single 2-D array (one row per word).\n",
    "vectors = np.asarray(vectors)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Inverse lookup (row index -> word), used to turn neighbour ids back into words.\n",
    "rev_dictionary = dict(zip(dictionary.values(), dictionary.keys()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "import tensorflow as tf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "class Model:\n",
    "    \"\"\"Graph that scores query vectors against every row of `vectors` by cosine similarity.\n",
    "\n",
    "    Attributes:\n",
    "        X: float32 placeholder of shape [None, vectors.shape[1]] that receives the queries.\n",
    "        cosine_similarity: matmul of the L2-normalised queries with the transposed\n",
    "            L2-normalised embedding matrix, i.e. one row of scores per query.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self):\n",
    "        embedding_dim = vectors.shape[1]\n",
    "        self._embedding = tf.convert_to_tensor(vectors, dtype = tf.float32)\n",
    "        self.X = tf.placeholder(tf.float32, [None, embedding_dim])\n",
    "\n",
    "        # Rows scaled to unit length turn the dot product below into a cosine similarity.\n",
    "        unit_embedding = tf.nn.l2_normalize(self._embedding, axis = 1)\n",
    "        unit_queries = tf.nn.l2_normalize(self.X, axis = 1)\n",
    "        embedding_t = tf.transpose(unit_embedding, [1, 0])\n",
    "        self.cosine_similarity = tf.matmul(unit_queries, embedding_t)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the similarity graph first, then open the session that evaluates it.\n",
    "model = Model()\n",
    "sess = tf.InteractiveSession()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "string_positive = 'i really love to eat chicken and meat'\n",
    "string_negative = 'i really hate you and i do not want to see you again'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "import random"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "# k -> (similarity tensor, top-k op). Lets repeated calls reuse one graph node instead of\n",
    "# appending a new top-k op to the default graph on every call.\n",
    "_TOP_K_OPS = {}\n",
    "\n",
    "\n",
    "def _top_k_op(k):\n",
    "    \"\"\"Return the `tf.nn.top_k` op over `model.cosine_similarity`, built at most once per `k`.\"\"\"\n",
    "    similarity = model.cosine_similarity\n",
    "    cached = _TOP_K_OPS.get(k)\n",
    "    # Rebuild when `model` was re-created and now exposes a different tensor.\n",
    "    if cached is None or cached[0] is not similarity:\n",
    "        cached = (similarity, tf.nn.top_k(similarity, k = k))\n",
    "        _TOP_K_OPS[k] = cached\n",
    "    return cached[1]\n",
    "\n",
    "\n",
    "def augmentation(string, threshold = 0.5, count = 5, k = 8):\n",
    "    \"\"\"Create `count` variants of `string` by swapping random words for embedding neighbours.\n",
    "\n",
    "    Args:\n",
    "        string: whitespace-separated sentence.\n",
    "        threshold: a word is picked for replacement when `random.random() > threshold`.\n",
    "        count: number of augmented sentences to return.\n",
    "        k: number of top cosine-similarity neighbours each picked word is replaced from.\n",
    "\n",
    "    Returns:\n",
    "        A list of `count` sentences. Words missing from `dictionary` are never replaced.\n",
    "        If nothing can be replaced (no in-vocabulary word, or `threshold >= 1`) the\n",
    "        whitespace-normalised input is returned `count` times.\n",
    "    \"\"\"\n",
    "    string = string.split()\n",
    "    # The resampling loop below only ends once a word is selected. Previously an empty\n",
    "    # string or threshold >= 1 looped forever, and an unknown word raised KeyError.\n",
    "    if threshold >= 1 or not any(w in dictionary for w in string):\n",
    "        return [' '.join(string) for _ in range(count)]\n",
    "    selected = []\n",
    "    while not selected:\n",
    "        # Draw a random number for every token, but keep only words that have an embedding.\n",
    "        selected = [\n",
    "            (no, w) for no, w in enumerate(string)\n",
    "            if random.random() > threshold and w in dictionary\n",
    "        ]\n",
    "    indices = [no for no, _ in selected]\n",
    "\n",
    "    batches = vectors[[dictionary[w] for _, w in selected]]\n",
    "    results = sess.run(_top_k_op(k), feed_dict = {model.X: batches})\n",
    "    # One list of candidate replacement words per selected position.\n",
    "    words = [[rev_dictionary[i] for i in result] for result in results.indices]\n",
    "    augmented = []\n",
    "    for _ in range(count):\n",
    "        string_ = string[:]\n",
    "        for position, neighbours in zip(indices, words):\n",
    "            string_[position] = random.choice(neighbours)\n",
    "        augmented.append(' '.join(string_))\n",
    "    return augmented"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 4.07 s, sys: 1.59 s, total: 5.66 s\n",
      "Wall time: 5.68 s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "['i thing love to eat pork also chicken',\n",
       " 'i really love to eat fried well cooked',\n",
       " 'i things love to eat meat with beef',\n",
       " 'i thing love to eat roasted well chicken',\n",
       " 'i something love to eat cooked , beef']"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "augmentation(string_positive)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 3.96 s, sys: 1.78 s, total: 5.73 s\n",
      "Wall time: 5.75 s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "['i really hate you also i know not want make see i again',\n",
       " \"i really hate you while i n't not want help see 'll again\",\n",
       " \"i really hate you . i not not want take see 'll again\",\n",
       " 'i really hate you well i know not want able see you again',\n",
       " \"i really hate you , i does not want could see 'll again\"]"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "augmentation(string_negative)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
